# set global knit optionsknitr::opts_chunk$set(echo = T,eval = T,error = F,warning = F,message = F,cache = T)# suppress scientific notationoptions(scipen=999)# list of required packagespackages <-c( #"SIN", # this package was removed from the CRAN repository"MASS", "dplyr", "tidyr", "purrr", "extraDistr", "ggplot2", "loo", "bridgesampling", "brms", "bayesplot", "tictoc", "hypr", "bcogsci", "papaja", "grid", "kableExtra", "gridExtra", "lme4", "cowplot", "pdftools", "cmdstanr", "rootSolve", "rstan" )# NB: if you haven't already installed bcogsci through devtools, it won't be loaded## Now load or install & load allpackage.check <-lapply( packages,FUN =function(x) {if (!require(x, character.only =TRUE)) {install.packages(x, dependencies =TRUE)library(x, character.only =TRUE) } })# this is also required, taken from the textbook## Save compiled models:rstan_options(auto_write =FALSE)## Parallelize the chains using all the cores:options(mc.cores = parallel::detectCores())# To solve some conflicts between packagesselect <- dplyr::selectextract <- rstan::extract
1 Chapter 5 - Bayesian hierachical models
1.1 Exercise 5.1 A hierarchical model (normal likelihood) of cognitive load on pupil .
As in section 4.1, we focus on the effect of cognitive load on pupil , but this time we look at all the subjects of Wahn et al. (2016):
You should be able to now fit a “maximal” model (correlated varying intercept and slopes for subjects) assuming a normal likelihood. Base your priors in the priors discussed in section 4.1.
fit_pupil <-brm(p_size ~1+ c_load + (1+c_load | subj), data = df_pupil_complete,family =gaussian(),prior =c(prior(normal(1000, 500), class = Intercept),prior(normal(0, 1000), class = sigma),prior(normal(0, 100), class = b, coef = c_load),prior(normal(0, 1000), class = sd),prior(lkj(2), class = cor) ) )
Running /Library/Frameworks/R.framework/Resources/bin/R CMD SHLIB foo.c
clang -arch arm64 -I"/Library/Frameworks/R.framework/Resources/include" -DNDEBUG -I"/Users/danielapalleschi/Library/Caches/org.R-project.R/R/renv/cache/v5/R-4.2/aarch64-apple-darwin20/Rcpp/1.0.10/e749cae40fa9ef469b6050959517453c/Rcpp/include/" -I"/Users/danielapalleschi/Documents/R/Bayesian/Nicenboim_2023/renv/library/R-4.2/aarch64-apple-darwin20/RcppEigen/include/" -I"/Users/danielapalleschi/Documents/R/Bayesian/Nicenboim_2023/renv/library/R-4.2/aarch64-apple-darwin20/RcppEigen/include/unsupported" -I"/Users/danielapalleschi/Documents/R/Bayesian/Nicenboim_2023/renv/library/R-4.2/aarch64-apple-darwin20/BH/include" -I"/Users/danielapalleschi/Documents/R/Bayesian/Nicenboim_2023/renv/library/R-4.2/aarch64-apple-darwin20/StanHeaders/include/src/" -I"/Users/danielapalleschi/Documents/R/Bayesian/Nicenboim_2023/renv/library/R-4.2/aarch64-apple-darwin20/StanHeaders/include/" -I"/Users/danielapalleschi/Library/Caches/org.R-project.R/R/renv/cache/v5/R-4.2/aarch64-apple-darwin20/RcppParallel/5.1.7/a45594a00f5dbb073d5ec9f48592a08a/RcppParallel/include/" -I"/Users/danielapalleschi/Documents/R/Bayesian/Nicenboim_2023/renv/library/R-4.2/aarch64-apple-darwin20/rstan/include" -DEIGEN_NO_DEBUG -DBOOST_DISABLE_ASSERTS -DBOOST_PENDING_INTEGER_LOG2_HPP -DSTAN_THREADS -DUSE_STANC3 -DSTRICT_R_HEADERS -DBOOST_PHOENIX_NO_VARIADIC_EXPRESSION -DBOOST_NO_AUTO_PTR -include '/Users/danielapalleschi/Documents/R/Bayesian/Nicenboim_2023/renv/library/R-4.2/aarch64-apple-darwin20/StanHeaders/include/stan/math/prim/fun/Eigen.hpp' -D_REENTRANT -DRCPP_PARALLEL_USE_TBB=1 -I/opt/R/arm64/include -fPIC -falign-functions=64 -Wall -g -O2 -c foo.c -o foo.o
In file included from <built-in>:1:
In file included from /Users/danielapalleschi/Documents/R/Bayesian/Nicenboim_2023/renv/library/R-4.2/aarch64-apple-darwin20/StanHeaders/include/stan/math/prim/fun/Eigen.hpp:22:
In file included from /Users/danielapalleschi/Documents/R/Bayesian/Nicenboim_2023/renv/library/R-4.2/aarch64-apple-darwin20/RcppEigen/include/Eigen/Dense:1:
In file included from /Users/danielapalleschi/Documents/R/Bayesian/Nicenboim_2023/renv/library/R-4.2/aarch64-apple-darwin20/RcppEigen/include/Eigen/Core:88:
/Users/danielapalleschi/Documents/R/Bayesian/Nicenboim_2023/renv/library/R-4.2/aarch64-apple-darwin20/RcppEigen/include/Eigen/src/Core/util/Macros.h:628:1: error: unknown type name 'namespace'
namespace Eigen {
^
/Users/danielapalleschi/Documents/R/Bayesian/Nicenboim_2023/renv/library/R-4.2/aarch64-apple-darwin20/RcppEigen/include/Eigen/src/Core/util/Macros.h:628:16: error: expected ';' after top level declarator
namespace Eigen {
^
;
In file included from <built-in>:1:
In file included from /Users/danielapalleschi/Documents/R/Bayesian/Nicenboim_2023/renv/library/R-4.2/aarch64-apple-darwin20/StanHeaders/include/stan/math/prim/fun/Eigen.hpp:22:
In file included from /Users/danielapalleschi/Documents/R/Bayesian/Nicenboim_2023/renv/library/R-4.2/aarch64-apple-darwin20/RcppEigen/include/Eigen/Dense:1:
/Users/danielapalleschi/Documents/R/Bayesian/Nicenboim_2023/renv/library/R-4.2/aarch64-apple-darwin20/RcppEigen/include/Eigen/Core:96:10: fatal error: 'complex' file not found
#include <complex>
^~~~~~~~~
3 errors generated.
make: *** [foo.o] Error 1
Examine the effect of load on pupil size, and the average pupil size. What do you conclude?
My solution
fit_pupil
Family: gaussian
Links: mu = identity; sigma = identity
Formula: p_size ~ 1 + c_load + (1 + c_load | subj)
Data: df_pupil_complete (Number of observations: 2228)
Draws: 4 chains, each with iter = 2000; warmup = 1000; thin = 1;
total post-warmup draws = 4000
Group-Level Effects:
~subj (Number of levels: 20)
Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS
sd(Intercept) 3323.30 444.56 2565.32 4260.54 1.01 699
sd(c_load) 71.66 15.14 47.66 108.08 1.01 1190
cor(Intercept,c_load) 0.32 0.23 -0.19 0.71 1.00 1658
Tail_ESS
sd(Intercept) 1385
sd(c_load) 1701
cor(Intercept,c_load) 2185
Population-Level Effects:
Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
Intercept 2476.78 505.19 1463.96 3437.33 1.01 601 911
c_load 38.15 24.12 -12.19 83.16 1.01 1350 2211
Family Specific Parameters:
Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
sigma 505.23 7.74 490.67 520.21 1.00 5962 2704
Draws were sampled using sampling(NUTS). For each parameter, Bulk_ESS
and Tail_ESS are effective sample size measures, and Rhat is the potential
scale reduction factor on split chains (at convergence, Rhat = 1).
## For the hierarchical model is more complicated, # because we want the effect (beta) + adjustment: # we extract the overall group level effect:beta <-c(as_draws_df(fit_pupil_2)$b_c_load)# We extract the individual adjustmentsind_effects_v <-paste0("r_subj[", unique(df_pupil_complete$subj), ",c_load]") adjustment <-as.matrix(as_draws_df(fit_pupil)[ind_effects_v])# We get the by subject effects in a data frame where each adjustment # is in each column.by_subj_effect <-as_tibble(beta + adjustment)# We summarize them by getting a table with the mean and the# quantiles for each column and then binding them.par_h <-lapply(by_subj_effect, function(x) {tibble(Estimate =mean(x),Q2.5 =quantile(x, .025),Q97.5 =quantile(x, .975) )}) %>%bind_rows() %>%# We add a column to identify that the model, # and one with the subject labels:mutate(subj =unique(df_pupil_complete$subj)) %>%arrange(Estimate) %>%mutate(subj =factor(subj, levels = subj))ggplot(par_h,aes(ymin = Q2.5,ymax = Q97.5,x = subj,y = Estimate )) +geom_errorbar() +geom_point() +# We'll also add the mean and 95% CrI of the overall difference # to the plot:geom_hline(yintercept =posterior_summary(fit_pupil_2)["b_c_load", "Estimate"]) +geom_hline(yintercept =posterior_summary(fit_pupil_2)["b_c_load", "Q2.5"],linetype ="dotted",linewidth = .5 ) +geom_hline(yintercept =posterior_summary(fit_pupil_2)["b_c_load", "Q97.5"],linetype ="dotted",linewidth = .5 ) +coord_flip() +ylab("Change in pupil size") +xlab("Participant ID") +theme_bw()
1.2 Exercise 5.2 Are subject relatives easier to process than object relatives (log-normal likelihood)?
We begin with a classic question from the psycholinguistics literature: Are subject relatives easier to process than object relatives? The data come from Experiment 1 in a paper by Grodner and Gibson (2005).
Scientific question: Is there a subject relative advantage in reading?
Grodner and Gibson (2005) investigate an old claim in psycholinguistics that object relative clause (ORC) sentences are more difficult to process than subject relative clause (SRC) sentences. One explanation for this predicted difference is that the distance between the relative clause verb (sent in the example below) and the head noun phrase of the relative clause (reporter in the example below) is longer in ORC vs. SRC. Examples are shown below. The relative clause is shown in square brackets.
(1a) The reporter [who the photographer sent to the editor] was hoping for a good story. (ORC)
(1b) The reporter [who sent the photographer to the editor] was hoping for a good story. (SRC)
The underlying explanation has to do with memory processes: Shorter linguistic dependencies are easier to process due to either reduced interference or decay, or both. For implemented computational models that spell this point out, see Lewis and Vasishth (2005) and Engelmann, Jäger, and Vasishth (2020).
In the Grodner and Gibson data, the dependent measure is reading time at the relative clause verb, (e.g., sent) of different sentences with either ORC or SRC. The dependent variable is in milliseconds and was measured in a self-paced reading task. Self-paced reading is a task where subjects read a sentence or a short text word-by-word or phrase-by-phrase, pressing a button to get each word or phrase displayed; the preceding word disappears every time the button is pressed. In 6.1, we provide a more detailed explanation of this experimental method.
For this experiment, we are expecting longer reading times at the relative clause verbs of ORC sentences in comparison to the relative clause verb of SRC sentences.
Estimate the median difference between relative clause attachment sites in milliseconds, and report the mean and 95% CI.
My solution
alpha <-as_draws_df(fit_df_gg05_rc)$b_Interceptbeta <-as_draws_df(fit_df_gg05_rc)$b_condition1# Difference between object RC coded as .5 and subject RC coded as .5 effect <-exp(alpha + beta * .5) -exp(alpha + beta *-.5)c(mean =mean(effect), quantile(effect, c(.025,.975)))
mean 2.5% 97.5%
36.025032 4.942583 69.163421
Do a sensitivity analysis. What is the estimate of the effect (\(\beta\)) under different priors? What is the difference in milliseconds between conditions under different priors?
My solution
first let’s check out the fit of the model
pp_check(fit_df_gg05_rc, ndraws =50, type ="dens_overlay")
now by participants
ppc_dens_overlay_grouped(df_gg05_rc$RT,yrep =posterior_predict(fit_df_gg05_rc,ndraws =100 ),group = df_gg05_rc$subj) +xlab("Signal in the N400 spatiotemporal window")
we see lots of variation in estimates as a function of our priors
with the tight priors, there is a lot more uncertainty
with the regularised and wider priors the effects are a bit more similar
# these are all the intercept, i'm interested in the slope though :/ggpubr::ggarrange(pp_check(fit_df_gg05_rc_wide, type ="stat") +theme_bw() +ggtitle("Wide priors N(0,1)"),pp_check(fit_df_gg05_rc_tight, type ="stat") +theme_bw() +ggtitle("Tight priors N(0,.01)"),pp_check(fit_df_gg05_rc, type ="stat") +theme_bw() +ggtitle("Original priors N(0,.1)"),nrow =3)
1.3 Exercise 5.3 Relative clause processing in Mandarin Chinese
Load the following two data sets:
data("df_gibsonwu")data("df_gibsonwu2")
The data are taken from two experiments that investigate (inter alia) the effect of relative clause type on reading time in Chinese. The data are from Gibson and Wu (2013) and Vasishth et al. (2013) respectively. The second data set is a direct replication attempt of the Gibson and Wu (2013) experiment.
Chinese relative clauses are interesting theoretically because they are prenominal: the relative clause appears before the head noun. For example, the English relative clauses shown above would appear in the following order in Mandarin. The square brackets mark the relative clause, and REL refers to the Chinese equivalent of the English relative pronoun who.
(2a) [The photographer sent to the editor] REL the reporter was hoping for a good story. (ORC)
(2b) [sent the photographer to the editor] REL the reporter who was hoping for a good story. (SRC)
As discussed in Gibson and Wu (2013), the consequence of Chinese relative clauses being prenominal is that the distance between the verb in relative clause and the head noun is larger in subject relatives than object relatives. Hsiao and Gibson (2003) were the first to suggest that the larger distance in subject relatives leads to longer reading time at the head noun. Under this view, the prediction is that subject relatives are harder to process than object relatives. If this is true, this is interesting and surprising because in most other languages that have been studied, subject relatives are easier to process than object relatives; so Chinese will be a very unusual exception cross-linguistically.
The data provided are for the critical region (the head noun; here, reporter). The experiment method is self-paced reading, so we have reading times in milliseconds. The second data set is a direct replication attempt of the first data set, which is from Gibson and Wu (2013).
The research hypothesis is whether the difference in reading times between object and subject relative clauses is negative. For the first data set (df_gibsonwu), investigate this question by fitting two “maximal” hierarchical models (correlated varying intercept and slopes for subjects and items). The dependent variable in both models is the raw reading time in milliseconds. The first model should use the normal likelihood in the model; the second model should use the log-normal likelihood. In both models, use \(\pm 0.5\) sum coding to model the effect of relative clause type. You will need to decide on appropriate priors for the various parameters.
priors_gw_norm <-c(set_prior("normal(500, 150)", class ="Intercept"),set_prior("normal(0,500)", class ="b", coef ="type1"),set_prior("normal(0,500)", class ="sd"),set_prior("normal(0,1000)",class ="sigma"),set_prior("lkj(2)", class ="cor"))
fit model
fit_gw_norm <-brm(rt ~1+ type + (1+ type | subj) + (1+ type | item), data = df_gibsonwu,family =gaussian(),prior = priors_gw_norm)
Log-normal likelihood
set priors
priors_gw_log <-c(set_prior("normal(6, 1.5)", class ="Intercept"),set_prior("normal(0,1)", class ="b", coef ="type1"),set_prior("normal(0,1)",class ="sd"),set_prior("normal(0,1)",class ="sigma"),set_prior("lkj(2)", class ="cor"))
fit model
fit_gw_log <-brm(rt ~1+ type + (1+ type | subj) + (1+ type | item), data = df_gibsonwu,family =lognormal(),prior = priors_gw_log)
Plot the posterior predictive distributions from the two models. What is the difference in the posterior predictive distributions of the two models; and why is there a difference?
95% credible interval for estimates include 0 for both norm and log-norm distributions
log-norm model CrI is tighter, and includes smaller effect size (but includes 0)
Given the posterior predictive distributions you plotted above, why is the log-normal likelihood model better for carrying out inference and hypothesis testing?
Tip
# look at range per typedf_gibsonwu %>%group_by(type) %>%summarise(min(rt), max(rt))
# A tibble: 2 × 3
type `min(rt)` `max(rt)`
<fct> <int> <int>
1 obj-ext 172 2308
2 subj-ext 189 6217
subj-ext had extreme raw values; this would’ve biased the model with a normal distribution
Next, work out a normal approximation of the log-normal model’s posterior distribution for the relative clause effect that you obtained from the above data analysis. Then use that normal approximation as an informative prior for the slope parameter when fitting a hierarchical model to the second data set. This is an example of incrementally building up knowledge by successively using a previous study’s posterior as a prior for the next study; this is essentially equivalent to pooling both data sets (check that pooling the data and using a Normal(0,1) prior for the effect of interest, with a log-normal likelihood, gives you approximately the same posterior as the informative-prior model fit above).
Dataset 1 log estimates
first check 2nd dataset
head(df_gibsonwu2)
subj item condition pos rt region
9 1m1 15 obj-ext 8 832 head noun
20 1m1 8 subj-ext 8 2131 head noun
33 1m1 11 obj-ext 8 553 head noun
46 1m1 10 subj-ext 8 1091 head noun
62 1m1 16 subj-ext 8 598 head noun
75 1m1 14 subj-ext 8 645 head noun
remind ourselves of the estimates from the first dataset
summary(fit_gw_log)
Family: lognormal
Links: mu = identity; sigma = identity
Formula: rt ~ 1 + type + (1 + type | subj) + (1 + type | item)
Data: df_gibsonwu (Number of observations: 547)
Draws: 4 chains, each with iter = 2000; warmup = 1000; thin = 1;
total post-warmup draws = 4000
Group-Level Effects:
~item (Number of levels: 15)
Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS
sd(Intercept) 0.20 0.05 0.12 0.33 1.00 1171
sd(type1) 0.07 0.06 0.00 0.22 1.00 1560
cor(Intercept,type1) 0.00 0.41 -0.76 0.77 1.00 3490
Tail_ESS
sd(Intercept) 1975
sd(type1) 1432
cor(Intercept,type1) 2153
~subj (Number of levels: 37)
Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS
sd(Intercept) 0.26 0.04 0.19 0.34 1.00 1593
sd(type1) 0.14 0.07 0.01 0.28 1.00 995
cor(Intercept,type1) -0.46 0.31 -0.91 0.30 1.00 2120
Tail_ESS
sd(Intercept) 2399
sd(type1) 747
cor(Intercept,type1) 1730
Population-Level Effects:
Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
Intercept 6.07 0.07 5.93 6.21 1.00 1059 1635
type1 -0.07 0.06 -0.19 0.04 1.00 2775 2496
Family Specific Parameters:
Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
sigma 0.51 0.02 0.48 0.55 1.00 3727 2942
Draws were sampled using sampling(NUTS). For each parameter, Bulk_ESS
and Tail_ESS are effective sample size measures, and Rhat is the potential
scale reduction factor on split chains (at convergence, Rhat = 1).
for replication study, use:
type: \(LogNormal(-0.07, 0.07)\) (type1 Estimate and Est. Error?)
Intercept: \(LogNormal(6, 0.06)\)
Dataset 2 model
priors_gw2_log <-c(set_prior("normal(6, 0.07)", class ="Intercept"),set_prior("normal(-0.7,0.06)", class ="b", coef ="type1"),set_prior("normal(0,1)",class ="sd"),set_prior("normal(0,1)",class ="sigma"),set_prior("lkj(2)", class ="cor"))
fit model; convergence warning (ESS too low)
fit_gw2_log <-brm(rt ~1+ type + (1+ type | subj) + (1+ type | item), data = df_gibsonwu2,family =lognormal(),prior = priors_gw2_log)
Warning: Bulk Effective Samples Size (ESS) is too low, indicating posterior means and medians may be unreliable.
Running the chains for more iterations may help. See
https://mc-stan.org/misc/warnings.html#bulk-ess
summary(fit_gw2_log)
Family: lognormal
Links: mu = identity; sigma = identity
Formula: rt ~ 1 + type + (1 + type | subj) + (1 + type | item)
Data: df_gibsonwu2 (Number of observations: 595)
Draws: 4 chains, each with iter = 2000; warmup = 1000; thin = 1;
total post-warmup draws = 4000
Group-Level Effects:
~item (Number of levels: 15)
Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS
sd(Intercept) 0.17 0.04 0.10 0.26 1.00 1465
sd(type1) 0.60 0.14 0.37 0.92 1.00 1100
cor(Intercept,type1) -0.27 0.33 -0.81 0.43 1.02 222
Tail_ESS
sd(Intercept) 2660
sd(type1) 1818
cor(Intercept,type1) 749
~subj (Number of levels: 40)
Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS
sd(Intercept) 0.24 0.04 0.18 0.32 1.00 1166
sd(type1) 0.11 0.06 0.01 0.23 1.00 925
cor(Intercept,type1) 0.03 0.35 -0.67 0.70 1.00 3838
Tail_ESS
sd(Intercept) 2179
sd(type1) 1580
cor(Intercept,type1) 2349
Population-Level Effects:
Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
Intercept 6.03 0.05 5.92 6.13 1.00 502 1134
type1 -0.61 0.06 -0.74 -0.49 1.00 2945 2338
Family Specific Parameters:
Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
sigma 0.43 0.01 0.40 0.46 1.00 3838 2955
Draws were sampled using sampling(NUTS). For each parameter, Bulk_ESS
and Tail_ESS are effective sample size measures, and Rhat is the potential
scale reduction factor on split chains (at convergence, Rhat = 1).
priors_gw12_log <-c(set_prior("normal(6, 0.07)", class ="Intercept"),set_prior("normal(-0.7,0.06)", class ="b", coef ="type1"),set_prior("normal(0,1)",class ="sd"),set_prior("normal(0,1)",class ="sigma"),set_prior("lkj(2)", class ="cor"))
fit_gw12_log <-brm(rt ~1+ type + (1+ type | subj) + (1+ type | item), data = df_gw12,family =lognormal(),prior = priors_gw12_log)
summary(fit_gw12_log)
Family: lognormal
Links: mu = identity; sigma = identity
Formula: rt ~ 1 + type + (1 + type | subj) + (1 + type | item)
Data: df_gw12 (Number of observations: 1142)
Draws: 4 chains, each with iter = 2000; warmup = 1000; thin = 1;
total post-warmup draws = 4000
Group-Level Effects:
~item (Number of levels: 30)
Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS
sd(Intercept) 0.18 0.03 0.12 0.25 1.00 1382
sd(type1) 0.44 0.10 0.27 0.66 1.00 824
cor(Intercept,type1) -0.12 0.28 -0.62 0.44 1.01 406
Tail_ESS
sd(Intercept) 1980
sd(type1) 999
cor(Intercept,type1) 809
~subj (Number of levels: 77)
Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS
sd(Intercept) 0.24 0.03 0.19 0.30 1.00 1573
sd(type1) 0.12 0.05 0.01 0.22 1.00 826
cor(Intercept,type1) -0.33 0.27 -0.81 0.25 1.00 3284
Tail_ESS
sd(Intercept) 2804
sd(type1) 1232
cor(Intercept,type1) 2090
Population-Level Effects:
Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
Intercept 6.04 0.05 5.94 6.14 1.01 605 1378
type1 -0.49 0.07 -0.63 -0.35 1.00 1151 949
Family Specific Parameters:
Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
sigma 0.47 0.01 0.45 0.49 1.00 3722 2772
Draws were sampled using sampling(NUTS). For each parameter, Bulk_ESS
and Tail_ESS are effective sample size measures, and Rhat is the potential
scale reduction factor on split chains (at convergence, Rhat = 1).
priors_gw12_log_reg <-c(set_prior("normal(6, 1.5)", class ="Intercept"),set_prior("normal(0,1)", class ="b", coef ="type1"),set_prior("normal(0,1)",class ="sd"),set_prior("normal(0,1)",class ="sigma"),set_prior("lkj(2)", class ="cor"))
fit_gw12_log_reg <-brm(rt ~1+ type + (1+ type | subj) + (1+ type | item), data = df_gw12,family =lognormal(),prior = priors_gw12_log_reg)
summary(fit_gw12_log_reg)
Family: lognormal
Links: mu = identity; sigma = identity
Formula: rt ~ 1 + type + (1 + type | subj) + (1 + type | item)
Data: df_gw12 (Number of observations: 1142)
Draws: 4 chains, each with iter = 2000; warmup = 1000; thin = 1;
total post-warmup draws = 4000
Group-Level Effects:
~item (Number of levels: 30)
Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS
sd(Intercept) 0.17 0.03 0.12 0.24 1.00 1401
sd(type1) 0.11 0.05 0.01 0.20 1.01 1090
cor(Intercept,type1) -0.28 0.30 -0.81 0.37 1.00 2573
Tail_ESS
sd(Intercept) 2208
sd(type1) 1218
cor(Intercept,type1) 2349
~subj (Number of levels: 77)
Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS
sd(Intercept) 0.25 0.03 0.20 0.30 1.00 1293
sd(type1) 0.11 0.05 0.01 0.20 1.00 931
cor(Intercept,type1) -0.35 0.29 -0.84 0.33 1.00 2930
Tail_ESS
sd(Intercept) 1901
sd(type1) 845
cor(Intercept,type1) 2057
Population-Level Effects:
Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
Intercept 6.03 0.04 5.94 6.12 1.00 1060 1388
type1 -0.08 0.04 -0.15 -0.00 1.00 2966 2881
Family Specific Parameters:
Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
sigma 0.47 0.01 0.45 0.49 1.00 3574 3142
Draws were sampled using sampling(NUTS). For each parameter, Bulk_ESS
and Tail_ESS are effective sample size measures, and Rhat is the potential
scale reduction factor on split chains (at convergence, Rhat = 1).
prior_summary(fit_gw12_log_reg)
prior class coef group resp dpar nlpar lb ub
(flat) b
normal(0,1) b type1
normal(6, 1.5) Intercept
lkj_corr_cholesky(2) L
lkj_corr_cholesky(2) L item
lkj_corr_cholesky(2) L subj
normal(0,1) sd 0
normal(0,1) sd item 0
normal(0,1) sd Intercept item 0
normal(0,1) sd type1 item 0
normal(0,1) sd subj 0
normal(0,1) sd Intercept subj 0
normal(0,1) sd type1 subj 0
normal(0,1) sigma 0
source
default
user
user
user
(vectorized)
(vectorized)
user
(vectorized)
(vectorized)
(vectorized)
(vectorized)
(vectorized)
(vectorized)
user
The data are taken from an experiment that investigate (inter alia) the effect of number similarity between a noun and the auxiliary verb in sentences like the following. There are two levels to a factor called Int(erference): low and high.
(3a) low: The key to the cabinet are on the table (3b) high: The key to the cabinets are on the table
Here, in (3b), the auxiliary verb are is predicted to be read faster than in (3a), because the plural marking on the noun cabinets leads the reader to think that the sentence is grammatical. (Both sentences are ungrammatical.) This phenomenon, where the high condition is read faster than the low condition, is called agreement attraction.
The data provided are for the critical region (the auxiliary verb are). The experiment method is eye-tracking; we have total reading times in milliseconds.
The research question is whether the difference in reading times between high and low conditions is negative.
First, using a log-normal likelihood, fit a hierarchical model with correlated varying intercept and slopes for subjects and items. You will need to decide on the priors for the model.
By simply looking at the posterior distribution of the slope parameter, \(\beta\), what would you conclude about the theoretical claim relating to agreement attraction?
1.5 Exercise 5.5
1.6 Exercise 5.6
1.7 Exercise 5.7
1.8 Exercise 5.8
1.9 Quarto
Quarto enables you to weave together content and executable code into a finished presentation. To learn more about Quarto presentations see https://quarto.org/docs/presentations/.
1.10 Bullets
When you click the Render button a document will be generated that includes:
Content authored with markdown
Output from executable code
1.11 Code
When you click the Render button a presentation will be generated that includes both content and the output of embedded code. You can embed code like this:
1+1
[1] 2
Source Code
---title: "Ch. Exercises"author: "Daniela Palleschi"format: html: toc: true # html-math-method: katex self-contained: true css: styles.css number-sections: true number-depth: 3 code-overflow: wrap code-tools: trueeditor_options: chunk_output_type: console# bibliography: references.jsonbiblio-style: apalike---# Set options {-}```{r setup, message = F}#| code-fold: true# set global knit optionsknitr::opts_chunk$set(echo = T,eval = T,error = F,warning = F,message = F,cache = T)# suppress scientific notationoptions(scipen=999)# list of required packagespackages <-c( #"SIN", # this package was removed from the CRAN repository"MASS", "dplyr", "tidyr", "purrr", "extraDistr", "ggplot2", "loo", "bridgesampling", "brms", "bayesplot", "tictoc", "hypr", "bcogsci", "papaja", "grid", "kableExtra", "gridExtra", "lme4", "cowplot", "pdftools", "cmdstanr", "rootSolve", "rstan" )# NB: if you haven't already installed bcogsci through devtools, it won't be loaded## Now load or install & load allpackage.check <-lapply( packages,FUN =function(x) {if (!require(x, character.only =TRUE)) {install.packages(x, dependencies =TRUE)library(x, character.only =TRUE) } })# this is also required, taken from the textbook## Save compiled models:rstan_options(auto_write =FALSE)## Parallelize the chains using all the cores:options(mc.cores = parallel::detectCores())# To solve some conflicts between packagesselect <- dplyr::selectextract <- rstan::extract```# Chapter 5 - Bayesian hierachical models## Exercise 5.1 A hierarchical model (normal likelihood) of cognitive load on pupil .As in section 4.1, we focus on the effect of cognitive load on pupil , but this time we look at all the subjects of Wahn et al. (2016):```{r}data("df_pupil_complete")df_pupil_complete```You should be able to now fit a “maximal” model (correlated varying intercept and slopes for subjects) assuming a normal likelihood. Base your priors in the priors discussed in section 4.1.:::{.callout-tip}### Maximal model- first centre the predictor```{r}df_pupil_complete <- df_pupil_complete %>%mutate(c_load = load-mean(load))```- then run the model```{r, message=F, error=T, warning=T}fit_pupil <-brm(p_size ~1+ c_load + (1+c_load | subj), data = df_pupil_complete,family =gaussian(),prior =c(prior(normal(1000, 500), class = Intercept),prior(normal(0, 1000), class = sigma),prior(normal(0, 100), class = b, coef = c_load),prior(normal(0, 1000), class = sd),prior(lkj(2), class = cor) ) )```:::a. Examine the effect of load on pupil size, and the average pupil size. What do you conclude?:::{.callout-tip}#### My solution```{r}fit_pupil```- 95% CrI crosses 0: quite a bit of uncertainty- alternatively, to only print `c_load````{r}posterior_summary(fit_pupil, variable ="b_c_load")```- but the intercept is quite large:```{r}posterior_summary(fit_pupil, variable ="b_Intercept")```- even though we assumed it wouldn't be```{r}brms::prior_summary(fit_pupil)```- overly informative prior for the intercept?:::b. Do a sensitivity analysis for the prior on the intercept ($\alpha$). What is the estimate of the effect ($\beta$) under different priors?:::{.callout-tip}#### My solution- try a *wider* prior for $\alpha$; I'm choosing 3000 and 1000ms$$\alpha \sim Normal(3000,1000)$$```{r}fit_pupil_2 <-brm(p_size ~1+ c_load + (1+ c_load | subj), data = df_pupil_complete,family =gaussian(),prior =c(prior(normal(3000, 1000), class = Intercept),prior(normal(0, 1000), class = sigma),prior(normal(0, 100), class = b, coef = c_load),prior(normal(0, 1000), class = sd),prior(lkj(2), class = cor) ) )``````{r}posterior_summary(fit_pupil_2, variable ="b_c_load")```- the effect of `c_load` now ha more certainty```{r}posterior_summary(fit_pupil, variable ="b_Intercept")```:::c. Is the effect of load consistent across subjects? Investigate this visually.:::{.callout-tip}#### My solution- check out overall effect; there are 3 peaks```{r}pp_check(fit_pupil_2, ndraws =50, type ="dens_overlay")```- now by participants```{r}ppc_dens_overlay_grouped(df_pupil_complete$p_size,yrep =posterior_predict(fit_pupil_2,ndraws =100 ),group = df_pupil_complete$subj) +xlab("Signal in the N400 spatiotemporal window")```- and by sd```{r}pp_check(fit_pupil_2,type ="stat_grouped",ndraws =1000,group ="subj",stat ="sd")```- and also:```{r}## For the hierarchical model is more complicated, # because we want the effect (beta) + adjustment: # we extract the overall group level effect:beta <-c(as_draws_df(fit_pupil_2)$b_c_load)# We extract the individual adjustmentsind_effects_v <-paste0("r_subj[", unique(df_pupil_complete$subj), ",c_load]") adjustment <-as.matrix(as_draws_df(fit_pupil)[ind_effects_v])# We get the by subject effects in a data frame where each adjustment # is in each column.by_subj_effect <-as_tibble(beta + adjustment)# We summarize them by getting a table with the mean and the# quantiles for each column and then binding them.par_h <-lapply(by_subj_effect, function(x) {tibble(Estimate =mean(x),Q2.5 =quantile(x, .025),Q97.5 =quantile(x, .975) )}) %>%bind_rows() %>%# We add a column to identify that the model, # and one with the subject labels:mutate(subj =unique(df_pupil_complete$subj)) %>%arrange(Estimate) %>%mutate(subj =factor(subj, levels = subj))ggplot(par_h,aes(ymin = Q2.5,ymax = Q97.5,x = subj,y = Estimate )) +geom_errorbar() +geom_point() +# We'll also add the mean and 95% CrI of the overall difference # to the plot:geom_hline(yintercept =posterior_summary(fit_pupil_2)["b_c_load", "Estimate"]) +geom_hline(yintercept =posterior_summary(fit_pupil_2)["b_c_load", "Q2.5"],linetype ="dotted",linewidth = .5 ) +geom_hline(yintercept =posterior_summary(fit_pupil_2)["b_c_load", "Q97.5"],linetype ="dotted",linewidth = .5 ) +coord_flip() +ylab("Change in pupil size") +xlab("Participant ID") +theme_bw()```:::## Exercise 5.2 Are subject relatives easier to process than object relatives (log-normal likelihood)?We begin with a classic question from the psycholinguistics literature: Are subject relatives easier to process than object relatives? The data come from Experiment 1 in a paper by Grodner and Gibson (2005).Scientific question: Is there a subject relative advantage in reading?Grodner and Gibson (2005) investigate an old claim in psycholinguistics that object relative clause (ORC) sentences are more difficult to process than subject relative clause (SRC) sentences. One explanation for this predicted difference is that the distance between the relative clause verb (sent in the example below) and the head noun phrase of the relative clause (reporter in the example below) is longer in ORC vs. SRC. Examples are shown below. The relative clause is shown in square brackets.(1a) The reporter [who the photographer sent to the editor] was hoping for a good story. (ORC)(1b) The reporter [who sent the photographer to the editor] was hoping for a good story. (SRC)The underlying explanation has to do with memory processes: Shorter linguistic dependencies are easier to process due to either reduced interference or decay, or both. For implemented computational models that spell this point out, see Lewis and Vasishth (2005) and Engelmann, Jäger, and Vasishth (2020).In the Grodner and Gibson data, the dependent measure is reading time at the relative clause verb, (e.g., sent) of different sentences with either ORC or SRC. The dependent variable is in milliseconds and was measured in a self-paced reading task. Self-paced reading is a task where subjects read a sentence or a short text word-by-word or phrase-by-phrase, pressing a button to get each word or phrase displayed; the preceding word disappears every time the button is pressed. In 6.1, we provide a more detailed explanation of this experimental method.For this experiment, we are expecting longer reading times at the relative clause verbs of ORC sentences in comparison to the relative clause verb of SRC sentences.```{r}data("df_gg05_rc")df_gg05_rc```You should use a sum coding for the predictors. Here, object relative clauses ("objgaps") are coded +1, subject relative clauses -1.:::{.callout-tip}# Set contrasts```{r}df_gg05_rc$condition <-factor(df_gg05_rc$condition, levels =c("subjgap","objgap"))class(df_gg05_rc$condition)contrasts(df_gg05_rc$condition) <-c(-.5,+.5)contrasts(df_gg05_rc$condition)```- or, as in the book:```{r}df_gg05_rc <- df_gg05_rc %>%mutate(c_cond =if_else(condition =="objgap", .5, -.5))```:::You should be able to now fit a “maximal” model (correlated varying intercept and slopes for subjects and for items) assuming a log-normal likelihood.:::{.callout-tip}#### Maximal model```{r}fit_df_gg05_rc <-brm( RT ~ condition + (condition | subj) + (condition | item),family =lognormal(),prior =c(prior(normal(6, 1.5), class = Intercept),prior(normal(0, .1), class = b),prior(normal(0, 1), class = sigma),prior(normal(0, 1), class = sd),prior(lkj(2), class = cor) ),data = df_gg05_rc )fit_df_gg05_rc```:::a. Examine the effect of relative clause attachment site (the predictor `c_cond`) on reading times RT ($\beta$).:::{.callout-tip}##### My solution- the effect of RC attachment on RT on the log scale is ```{r}posterior_summary(fit_df_gg05_rc, variable ="b_condition1")```:::b. Estimate the median difference between relative clause attachment sites in milliseconds, and report the mean and 95% CI.:::{.callout-tip}##### My solution```{r}alpha <-as_draws_df(fit_df_gg05_rc)$b_Interceptbeta <-as_draws_df(fit_df_gg05_rc)$b_condition1# Difference between object RC coded as .5 and subject RC coded as .5 effect <-exp(alpha + beta * .5) -exp(alpha + beta *-.5)c(mean =mean(effect), quantile(effect, c(.025,.975)))```:::c. Do a sensitivity analysis. What is the estimate of the effect ($\beta$) under different priors? What is the difference in milliseconds between conditions under different priors?:::{.callout-tip}##### My solution- first let's check out the fit of the model```{r}pp_check(fit_df_gg05_rc, ndraws =50, type ="dens_overlay")```- now by participants```{r}ppc_dens_overlay_grouped(df_gg05_rc$RT,yrep =posterior_predict(fit_df_gg05_rc,ndraws =100 ),group = df_gg05_rc$subj) +xlab("Signal in the N400 spatiotemporal window")```- and by sd```{r}pp_check(fit_df_gg05_rc,type ="stat_grouped",ndraws =1000,group ="subj",stat ="sd")``````{r}posterior_summary(fit_df_gg05_rc, variable ="b_condition1")``````{r}posterior_summary(fit_df_gg05_rc, variable ="b_Intercept")```- our posteriors are pretty close to our priors```{r}brms::prior_summary(fit_df_gg05_rc)```- let's see if this is due to the influence of our priors. Let's start with *narrower* prior$$\beta \sim Normal(0,.01)$$```{r}fit_df_gg05_rc_tight <-brm( RT ~ condition + (condition | subj) + (condition | item),family =lognormal(),prior =c(prior(normal(6, 1.5), class = Intercept),prior(normal(0, .01), class = b),prior(normal(0, 1), class = sigma),prior(normal(0, 1), class = sd),prior(lkj(2), class = cor) ),data = df_gg05_rc )fit_df_gg05_rc_tight``````{r}posterior_summary(fit_df_gg05_rc_tight, variable ="b_condition1")```- and now a *wider* prior$$\beta \sim Normal(0,1)$$```{r}fit_df_gg05_rc_wide <-brm( RT ~ condition + (condition | subj) + (condition | item),family =lognormal(),prior =c(prior(normal(6, 1.5), class = Intercept),prior(normal(0, 1), class = b),prior(normal(0, 1), class = sigma),prior(normal(0, 1), class = sd),prior(lkj(2), class = cor) ),data = df_gg05_rc )fit_df_gg05_rc_wide``````{r}posterior_summary(fit_df_gg05_rc_wide, variable ="b_condition1")``````{r}prior <-data.frame(posterior_summary(fit_df_gg05_rc, variable ="b_condition1")) %>%mutate("Prior for $\\beta$"="Normal(0,.1)")tight <-data.frame(posterior_summary(fit_df_gg05_rc_tight, variable ="b_condition1")) %>%mutate("Prior for $\\beta$"="Normal(0,.01)")wide <-data.frame(posterior_summary(fit_df_gg05_rc_wide, variable ="b_condition1")) %>%mutate("Prior for $\\beta$"="Normal(0,1)")sens_priors <-rbind(prior,tight,wide)brms::prior_summary(fit_df_gg05_rc_wide)sens_priors %>%kbl() %>%kable_styling()```- and now in milliseconds```{r}alpha1 <-as_draws_df(fit_df_gg05_rc)$b_Intercept beta1 <-as_draws_df(fit_df_gg05_rc)$b_condition1diff1 <-exp(alpha1 + beta1/2) -exp(alpha1 - beta1/2)prior_ms <-data.frame(mean =mean(diff1), quantile(diff1, .025), quantile(diff1, .975), row.names =NULL) %>%rename("Estimate (ms)"="mean","2.5%"="quantile.diff1..0.025.","97.5%"="quantile.diff1..0.975.")%>%mutate("Prior for $\\beta$"="Normal(0,.1)")alpha1 <-as_draws_df(fit_df_gg05_rc_tight)$b_Intercept beta1 <-as_draws_df(fit_df_gg05_rc_tight)$b_condition1diff1 <-exp(alpha1 + beta1/2) -exp(alpha1 - beta1/2)tight_ms <-data.frame(mean =mean(diff1), quantile(diff1, .025), quantile(diff1, .975), row.names =NULL) %>%rename("Estimate (ms)"="mean","2.5%"="quantile.diff1..0.025.","97.5%"="quantile.diff1..0.975.")%>%mutate("Prior for $\\beta$"="Normal(0,.01)")alpha1 <-as_draws_df(fit_df_gg05_rc_wide)$b_Intercept beta1 <-as_draws_df(fit_df_gg05_rc_wide)$b_condition1diff1 <-exp(alpha1 + beta1/2) -exp(alpha1 - beta1/2)wide_ms <-data.frame(mean =mean(diff1), quantile(diff1, .025), quantile(diff1, .975), row.names =NULL) %>%rename("Estimate (ms)"="mean","2.5%"="quantile.diff1..0.025.","97.5%"="quantile.diff1..0.975.") %>%mutate("Prior for $\\beta$"="Normal(0,1)")priors_ms <-rbind(prior_ms,tight_ms,wide_ms) %>%mutate(effect ="b_condition1") %>%relocate(effect, .before="Estimate (ms)")priors_ms %>%kbl() %>%kable_styling()```- we see lots of variation in estimates as a function of our priors + with the tight priors, there is a lot more uncertainty + with the regularised and wider priors the effects are a bit more similar```{r}# these are all the intercept, i'm interested in the slope though :/ggpubr::ggarrange(pp_check(fit_df_gg05_rc_wide, type ="stat") +theme_bw() +ggtitle("Wide priors N(0,1)"),pp_check(fit_df_gg05_rc_tight, type ="stat") +theme_bw() +ggtitle("Tight priors N(0,.01)"),pp_check(fit_df_gg05_rc, type ="stat") +theme_bw() +ggtitle("Original priors N(0,.1)"),nrow =3)```:::## Exercise 5.3 Relative clause processing in Mandarin ChineseLoad the following two data sets:```{r}data("df_gibsonwu")data("df_gibsonwu2")```The data are taken from two experiments that investigate (inter alia) the effect of relative clause type on reading time in Chinese. The data are from Gibson and Wu (2013) and Vasishth et al. (2013) respectively. The second data set is a direct replication attempt of the Gibson and Wu (2013) experiment.Chinese relative clauses are interesting theoretically because they are prenominal: the relative clause appears before the head noun. For example, the English relative clauses shown above would appear in the following order in Mandarin. The square brackets mark the relative clause, and REL refers to the Chinese equivalent of the English relative pronoun who.(2a) [The photographer sent to the editor] REL the reporter was hoping for a good story. (ORC)(2b) [sent the photographer to the editor] REL the reporter who was hoping for a good story. (SRC)As discussed in Gibson and Wu (2013), the consequence of Chinese relative clauses being prenominal is that the distance between the verb in relative clause and the head noun is larger in subject relatives than object relatives. Hsiao and Gibson (2003) were the first to suggest that the larger distance in subject relatives leads to longer reading time at the head noun. Under this view, the prediction is that subject relatives are harder to process than object relatives. If this is true, this is interesting and surprising because in most other languages that have been studied, subject relatives are easier to process than object relatives; so Chinese will be a very unusual exception cross-linguistically.The data provided are for the critical region (the head noun; here, reporter). The experiment method is self-paced reading, so we have reading times in milliseconds. The second data set is a direct replication attempt of the first data set, which is from Gibson and Wu (2013).The research hypothesis is whether the difference in reading times between object and subject relative clauses is negative. For the first data set (`df_gibsonwu`), investigate this question by fitting two “maximal” hierarchical models (correlated varying intercept and slopes for subjects and items). The dependent variable in both models is the raw reading time in milliseconds. The first model should use the normal likelihood in the model; the second model should use the log-normal likelihood. In both models, use $\pm 0.5$ sum coding to model the effect of relative clause type. You will need to decide on appropriate priors for the various parameters.:::{.callout-tip}#### Contrasts- `obj-ext = -0.5`, `subj-ext = 0.5````{r}head(df_gibsonwu)df_gibsonwu$type <-factor(df_gibsonwu$type, levels =c("obj-ext","subj-ext"))contrasts(df_gibsonwu$type) <-c(0.5,-0.5)contrasts(df_gibsonwu$type)```:::::::{.callout-tip}#### Normal likelihood- first set priors```{r}priors_gw_norm <-c(set_prior("normal(500, 150)", class ="Intercept"),set_prior("normal(0,500)", class ="b", coef ="type1"),set_prior("normal(0,500)", class ="sd"),set_prior("normal(0,1000)",class ="sigma"),set_prior("lkj(2)", class ="cor"))```- fit model```{r, message = F, error=TRUE, warning=TRUE}fit_gw_norm <-brm(rt ~1+ type + (1+ type | subj) + (1+ type | item), data = df_gibsonwu,family =gaussian(),prior = priors_gw_norm)```::::::{.callout-tip}#### Log-normal likelihood- set priors```{r}priors_gw_log <-c(set_prior("normal(6, 1.5)", class ="Intercept"),set_prior("normal(0,1)", class ="b", coef ="type1"),set_prior("normal(0,1)",class ="sd"),set_prior("normal(0,1)",class ="sigma"),set_prior("lkj(2)", class ="cor"))```- fit model```{r, message = F, error=TRUE, warning=TRUE}fit_gw_log <-brm(rt ~1+ type + (1+ type | subj) + (1+ type | item), data = df_gibsonwu,family =lognormal(),prior = priors_gw_log)```:::a. Plot the posterior predictive distributions from the two models. What is the difference in the posterior predictive distributions of the two models; and why is there a difference?:::{.callout-tip}##### Posterior predictive distributions```{r}ggpubr::ggarrange(pp_check(fit_gw_norm, ndraws =1000) +ggtitle("Normal distr") +theme(legend.position ="none"),pp_check(fit_gw_log, ndraws =1000) +ggtitle("Log-normal distr") +theme(legend.position ="none"), cowplot::get_legend(pp_check(fit_gw_log) +theme(legend.position ="bottom")),ncol =1,heights =c(.45,.45,.1))```- quite a mismatch in normal likelihood model, e.g., negative values are generated- log-normal better fit:::b. Examine the posterior distributions of the effect estimates (in milliseconds) in the two models. Why are these different?:::{.callout-tip}##### Posterior distribution ```{r}# Normal distrplot(fit_gw_norm)``````{r}# Log distrplot(fit_gw_log)```##### Effect estimates- look at effect estimates```{r}# normal distrgw_intercept_norm <-as_draws_df(fit_gw_norm)$b_Interceptgw_slope_norm <-as_draws_df(fit_gw_norm)$b_type1gw_RT_diff_norm <- gw_slope_normround(quantile(gw_RT_diff_norm,prob=c(0.025,0.975)),2)``````{r}# log distrgw_intercept_log <-as_draws_df(fit_gw_log)$b_Interceptgw_slope_log <-as_draws_df(fit_gw_log)$b_type1gw_RT_diff_log <-exp(gw_intercept_log + gw_slope_log/2) -exp(gw_intercept_log - gw_slope_log/2)quantile(gw_RT_diff_log,prob=c(0.025,0.975))``````{r}boxplot(rt~type,df_gibsonwu)```- 95% credible interval for estimates include 0 for both norm and log-norm distributions- log-norm model CrI is tighter, and includes smaller effect size (but includes 0):::c. Given the posterior predictive distributions you plotted above, why is the log-normal likelihood model better for carrying out inference and hypothesis testing?:::{.callout-tip}```{r}# look at range per typedf_gibsonwu %>%group_by(type) %>%summarise(min(rt), max(rt))``````{r}library(ggrain) df_gibsonwu %>%# filter(session!="bi") %>%# mutate(px_list = paste0(participant,list)) %>%ggplot(data = ., aes(x = type, y = rt, fill = type, color = type, shape = type)) +labs(title ="Distribution of observations") +geom_rain(alpha = .5, rain.side ='f1x1',violin.args =list(color =NA, alpha = .5)) +theme_bw() +scale_fill_manual(values=c("dodgerblue", "darkorange")) +scale_color_manual(values=c("dodgerblue", "darkorange")) ```#### My answer- subj-ext had extreme raw values; this would've biased the model with a normal distribution:::Next, work out a normal approximation of the log-normal model’s posterior distribution for the relative clause effect that you obtained from the above data analysis. Then use that normal approximation as an informative prior for the slope parameter when fitting a hierarchical model to the second data set. This is an example of incrementally building up knowledge by successively using a previous study’s posterior as a prior for the next study; this is essentially equivalent to pooling both data sets (check that pooling the data and using a Normal(0,1) prior for the effect of interest, with a log-normal likelihood, gives you approximately the same posterior as the informative-prior model fit above).:::{.callout-tip}#### Dataset 1 log estimates- first check 2nd dataset```{r}head(df_gibsonwu2)df_gibsonwu2 <- df_gibsonwu2 %>%rename("type"= condition)df_gibsonwu2$type <-factor(df_gibsonwu2$type, levels =c("obj-ext","subj-ext"))contrasts(df_gibsonwu2$type) <-c(0.5,-0.5)contrasts(df_gibsonwu2$type)```- remind ourselves of the estimates from the first dataset```{r}summary(fit_gw_log)```- for replication study, use: + type: $LogNormal(-0.07, 0.07)$ (`type1` Estimate and Est. Error?) + Intercept: $LogNormal(6, 0.06)$#### Dataset 2 model```{r}priors_gw2_log <-c(set_prior("normal(6, 0.07)", class ="Intercept"),set_prior("normal(-0.7,0.06)", class ="b", coef ="type1"),set_prior("normal(0,1)",class ="sd"),set_prior("normal(0,1)",class ="sigma"),set_prior("lkj(2)", class ="cor"))```- fit model; convergence warning (ESS too low)```{r, message = F, error=TRUE, warning=TRUE}fit_gw2_log <-brm(rt ~1+ type + (1+ type | subj) + (1+ type | item), data = df_gibsonwu2,family =lognormal(),prior = priors_gw2_log)``````{r}summary(fit_gw2_log)``````{r}plot(fit_gw2_log)```##### Estimates```{r}# log distrgw2_intercept_log <-as_draws_df(fit_gw2_log)$b_Interceptgw2_slope_log <-as_draws_df(fit_gw2_log)$b_type1gw2_RT_diff_log <-exp(gw2_intercept_log + gw2_slope_log/2) -exp(gw2_intercept_log - gw2_slope_log/2)quantile(gw2_RT_diff_log,prob=c(0.025,0.975))```::::::{.callout-tip}### Datasets 1 and 2```{r}# make sure 2 datasets have same column names/ordernames(df_gibsonwu); names(df_gibsonwu2)df_gibsonwu2 <- df_gibsonwu2 %>%select(subj,item,type,rt) %>%mutate(subj =paste0(subj,"_gw2"),item =paste0(item,"_gw2"))df_gibsonwu <- df_gibsonwu %>%mutate(subj =paste0(subj,"_gw1"),item =paste0(item,"_gw1"))names(df_gibsonwu) ==names(df_gibsonwu2)# combinedf_gw12 <-rbind(df_gibsonwu,df_gibsonwu2)```##### Informative model```{r}# contrastshead(df_gw12)df_gw12$type <-factor(df_gw12$type, levels =c("obj-ext","subj-ext"))contrasts(df_gw12$type) <-c(0.5,-0.5)contrasts(df_gw12$type)```- same priors as based on previous models```{r}priors_gw12_log <-c(set_prior("normal(6, 0.07)", class ="Intercept"),set_prior("normal(-0.7,0.06)", class ="b", coef ="type1"),set_prior("normal(0,1)",class ="sd"),set_prior("normal(0,1)",class ="sigma"),set_prior("lkj(2)", class ="cor"))``````{r, message = F, error=TRUE, warning=TRUE}fit_gw12_log <-brm(rt ~1+ type + (1+ type | subj) + (1+ type | item), data = df_gw12,family =lognormal(),prior = priors_gw12_log)``````{r}summary(fit_gw12_log)``````{r}plot(fit_gw12_log)```##### Estimates```{r}# log distrgw12_intercept_log <-as_draws_df(fit_gw12_log)$b_Interceptgw12_slope_log <-as_draws_df(fit_gw12_log)$b_type1gw12_RT_diff_log <-exp(gw12_intercept_log + gw12_slope_log/2) -exp(gw12_intercept_log - gw12_slope_log/2)quantile(gw12_RT_diff_log,prob=c(0.025,0.975))```- negative slope, CrI doesn't include 0#### Regularised prior?```{r}priors_gw12_log_reg <-c(set_prior("normal(6, 1.5)", class ="Intercept"),set_prior("normal(0,1)", class ="b", coef ="type1"),set_prior("normal(0,1)",class ="sd"),set_prior("normal(0,1)",class ="sigma"),set_prior("lkj(2)", class ="cor"))``````{r, message = F, error=TRUE, warning=TRUE}fit_gw12_log_reg <-brm(rt ~1+ type + (1+ type | subj) + (1+ type | item), data = df_gw12,family =lognormal(),prior = priors_gw12_log_reg)``````{r}summary(fit_gw12_log_reg)prior_summary(fit_gw12_log_reg)``````{r}plot(fit_gw12_log_reg)```##### Estimates```{r}# log distrgw12_intercept_log_reg <-as_draws_df(fit_gw12_log_reg)$b_Interceptgw12_slope_log_reg <-as_draws_df(fit_gw12_log_reg)$b_type1gw12_RT_diff_log_reg <-exp(gw12_intercept_log_reg + gw12_slope_log_reg/2) -exp(gw12_intercept_log_reg - gw12_slope_log_reg/2)quantile(gw12_RT_diff_log_reg,prob=c(0.025,0.975))```- now the 95% CrI does not include 0, but the range of values is much tighter and includes smaller values:::## Exercise 5.4 - Agreement attraction in comprehensionLoad the following data:```{r}data("df_dillonE1")dillonE1 <- df_dillonE1head(dillonE1)```The data are taken from an experiment that investigate (inter alia) the effect of number similarity between a noun and the auxiliary verb in sentences like the following. There are two levels to a factor called Int(erference): low and high.(3a) *low*: The key to the cabinet are on the table (3b) *high*: The key to the cabinets are on the tableHere, in (3b), the auxiliary verb are is predicted to be read faster than in (3a), because the plural marking on the noun cabinets leads the reader to think that the sentence is grammatical. (Both sentences are ungrammatical.) This phenomenon, where the high condition is read faster than the low condition, is called **agreement attraction**.The data provided are for the critical region (the auxiliary verb are). The experiment method is eye-tracking; we have total reading times in milliseconds.The research question is whether the difference in reading times between high and low conditions is negative.- First, using a log-normal likelihood, fit a hierarchical model with correlated varying intercept and slopes for subjects and items. You will need to decide on the priors for the model.- By simply looking at the posterior distribution of the slope parameter, $\beta$, what would you conclude about the theoretical claim relating to agreement attraction?## Exercise 5.5## Exercise 5.6## Exercise 5.7## Exercise 5.8## QuartoQuarto enables you to weave together content and executable code into a finished presentation. To learn more about Quarto presentations see <https://quarto.org/docs/presentations/>.## BulletsWhen you click the **Render** button a document will be generated that includes:- Content authored with markdown- Output from executable code## CodeWhen you click the **Render** button a presentation will be generated that includes both content and the output of embedded code. You can embed code like this:```{r}1+1```